{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "296960d3-8cec-4a8e-b0fd-8db8f0dce2f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "from requests.adapters import HTTPAdapter, Retry\n",
    "import pandas as pd\n",
    "from urllib.parse import quote\n",
    "import re\n",
    "import time\n",
    "from datetime import datetime\n",
    "from calendar import monthrange\n",
    "\n",
    "# =========================================================\n",
    "# 0) User config\n",
    "# =========================================================\n",
    "KEYWORDS = [\"원자력 발전\", \"원전\"] \n",
    "QUERY = \" OR \".join(KEYWORDS)\n",
    "\n",
    "START_YM = \"2010-01\"\n",
    "END_YM   = \"2025-12\"\n",
    "\n",
    "NATIONAL_TARGET_PER_MONTH = 100\n",
    "LOCAL_TARGET_PER_MONTH    = 100\n",
    "\n",
    "MAX_SEARCH_PAGES_PER_MONTH = 80   # safety cap; each page ~10 results (start=1,11,21...)\n",
    "SLEEP_BETWEEN_REQUESTS = 0.4\n",
    "SLEEP_BETWEEN_PAGES = 0.8\n",
    "\n",
    "OUT_XLSX = \"naver_monthly_sample_2010_2025.xlsx\"\n",
    "\n",
    "# =========================================================\n",
    "# 1) Outlet lists (EDIT THESE TO MATCH NAVER PRESS LABELS)\n",
    "#    Use exactly the press names as shown on NAVER results.\n",
    "# =========================================================\n",
    "NATIONAL_OUTLETS = {\n",
    "    \"조선일보\", \"중앙일보\", \"동아일보\", \"한겨레\", \"경향신문\", \"한국일보\",\n",
    "    “국민일보\", “서울신문”, “세계일보”, “문화일보”, “오마이뉴스”, “미디어오늘”\n",
    "}\n",
    "\n",
    "# Local outlets: include only regions hosting / adjacent to nuclear sites (as in manuscript)\n",
    "LOCAL_OUTLETS = {\n",
    "    \"부산일보\", \"국제신문\", \"울산매일\", \"경남신문\", “경남일보”,\"경북일보\",\n",
    "    \"전북일보\", “전북의소리”, “광주매일신문”, \"전남일보\", \"광주일보\", \"무등일보\"\n",
    "}\n",
    "\n",
    "# For ideology robustness later, also keep these sets:\n",
    "NATIONAL_CONSERVATIVE = {\"조선일보\", \"중앙일보\", \"동아일보\"}  # used in robustness\n",
    "# NATIONAL_OTHERS = NATIONAL_OUTLETS - NATIONAL_CONSERVATIVE  # optional\n",
    "\n",
    "# =========================================================\n",
    "# 2) HTTP session with retries\n",
    "# =========================================================\n",
    "def make_session():\n",
    "    s = requests.Session()\n",
    "    retries = Retry(\n",
    "        total=6,\n",
    "        backoff_factor=0.7,\n",
    "        status_forcelist=[429, 500, 502, 503, 504],\n",
    "        allowed_methods=[\"GET\"],\n",
    "        raise_on_status=False,\n",
    "    )\n",
    "    s.mount(\"https://\", HTTPAdapter(max_retries=retries))\n",
    "    s.mount(\"http://\", HTTPAdapter(max_retries=retries))\n",
    "    s.headers.update({\n",
    "        \"User-Agent\": (\n",
    "            \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) \"\n",
    "            \"AppleWebKit/537.36 (KHTML, like Gecko) \"\n",
    "            \"Chrome/121.0.0.0 Safari/537.36\"\n",
    "        )\n",
    "    })\n",
    "    return s\n",
    "\n",
    "def clean_text(txt: str) -> str:\n",
    "    if not txt:\n",
    "        return \"\"\n",
    "    return re.sub(r\"\\s+\", \" \", txt).strip()\n",
    "\n",
    "def pick_article_url(info_links):\n",
    "    \"\"\"\n",
    "    Prefer NAVER-hosted news page (n.news.naver.com).\n",
    "    \"\"\"\n",
    "    hrefs = [a.get(\"href\", \"\") for a in info_links if a.get(\"href\")]\n",
    "    for h in hrefs:\n",
    "        if \"n.news.naver.com\" in h:\n",
    "            return h\n",
    "    # fallback\n",
    "    for h in hrefs:\n",
    "        if h:\n",
    "            return h\n",
    "    return None\n",
    "\n",
    "def parse_article(session, url):\n",
    "    \"\"\"\n",
    "    Parse NAVER news pages:\n",
    "    returns: title, content, date_ym, date_full (YYYY-MM-DD), time_full (optional)\n",
    "    \"\"\"\n",
    "    r = session.get(url, timeout=12)\n",
    "    if r.status_code != 200:\n",
    "        return \"\", \"\", \"\", \"\", \"\"\n",
    "\n",
    "    soup = BeautifulSoup(r.text, \"html.parser\")\n",
    "\n",
    "    # Title\n",
    "    title_el = (\n",
    "        soup.select_one(\"h2#title_area\") or\n",
    "        soup.select_one(\"h2.media_end_head_headline\") or\n",
    "        soup.select_one(\"h1\")\n",
    "    )\n",
    "\n",
    "    # Content\n",
    "    content_el = (\n",
    "        soup.select_one(\"div#dic_area\") or\n",
    "        soup.select_one(\"div#newsct_article\") or\n",
    "        soup.select_one(\"div#articleBodyContents\") or\n",
    "        soup.select_one(\"div#articeBody\")\n",
    "    )\n",
    "\n",
    "    # Date/time\n",
    "    date_el = (\n",
    "        soup.select_one(\"span.media_end_head_info_datestamp_time\") or\n",
    "        soup.select_one(\"span._ARTICLE_DATE_TIME\") or\n",
    "        soup.select_one(\"span.t11\")\n",
    "    )\n",
    "\n",
    "    title = clean_text(title_el.get_text(\" \", strip=True)) if title_el else \"\"\n",
    "    content = clean_text(content_el.get_text(\" \", strip=True)) if content_el else \"\"\n",
    "    date_txt = clean_text(date_el.get_text(\" \", strip=True)) if date_el else \"\"\n",
    "\n",
    "    # Extract YYYY-MM-DD and HH:MM if present\n",
    "    # Common formats: 2021.03.12. 오전 10:22 / 2021-03-12 10:22 / etc.\n",
    "    ymd = \"\"\n",
    "    hm = \"\"\n",
    "    m1 = re.search(r\"(20\\d{2})[.\\-/년 ]\\s*(\\d{1,2})[.\\-/월 ]\\s*(\\d{1,2})\", date_txt)\n",
    "    if m1:\n",
    "        y = int(m1.group(1)); mo = int(m1.group(2)); d = int(m1.group(3))\n",
    "        ymd = f\"{y:04d}-{mo:02d}-{d:02d}\"\n",
    "\n",
    "    m2 = re.search(r\"(\\d{1,2}):(\\d{2})\", date_txt)\n",
    "    if m2:\n",
    "        hm = f\"{int(m2.group(1)):02d}:{int(m2.group(2)):02d}\"\n",
    "\n",
    "    ym = ymd[:7] if ymd else \"\"\n",
    "\n",
    "    return title, content, ym, ymd, hm\n",
    "\n",
    "def ym_to_datestr(ym: str):\n",
    "    \"\"\"\n",
    "    ym: 'YYYY-MM' -> ds 'YYYY.MM.DD', de 'YYYY.MM.DD'\n",
    "    \"\"\"\n",
    "    y, m = map(int, ym.split(\"-\"))\n",
    "    last_day = monthrange(y, m)[1]\n",
    "    ds = f\"{y:04d}.{m:02d}.01\"\n",
    "    de = f\"{y:04d}.{m:02d}.{last_day:02d}\"\n",
    "    return ds, de\n",
    "\n",
    "def iter_months(start_ym: str, end_ym: str):\n",
    "    start = datetime.strptime(start_ym, \"%Y-%m\")\n",
    "    end = datetime.strptime(end_ym, \"%Y-%m\")\n",
    "    months = []\n",
    "    cur = start\n",
    "    while cur <= end:\n",
    "        months.append(cur.strftime(\"%Y-%m\"))\n",
    "        # add one month\n",
    "        y = cur.year + (cur.month // 12)\n",
    "        m = (cur.month % 12) + 1\n",
    "        cur = cur.replace(year=y, month=m)\n",
    "    return months\n",
    "\n",
    "# =========================================================\n",
    "# 3) Main scraping loop\n",
    "# =========================================================\n",
    "session = make_session()\n",
    "\n",
    "all_rows = []\n",
    "global_seen = set()\n",
    "\n",
    "months = iter_months(START_YM, END_YM)\n",
    "\n",
    "for ym in months:\n",
    "    ds, de = ym_to_datestr(ym)\n",
    "\n",
    "    # Monthly quotas\n",
    "    nat_rows = []\n",
    "    loc_rows = []\n",
    "    seen_month = set()\n",
    "\n",
    "    # NAVER search URL template (date filters per month)\n",
    "    q = quote(QUERY)\n",
    "    base_search = (\n",
    "        \"https://search.naver.com/search.naver\"\n",
    "        f\"?where=news&sm=tab_pge&query={q}\"\n",
    "        \"&sort=0&photo=0&field=0&pd=3\"\n",
    "        f\"&ds={ds}&de={de}\"\n",
    "        f\"&nso=so:r,p:from{ds.replace('.','')}to{de.replace('.','')},a:all\"\n",
    "        \"&mynews=1\"   # more stable grouping; optional\n",
    "    )\n",
    "\n",
    "    print(f\"\\n[{ym}] target: National={NATIONAL_TARGET_PER_MONTH}, Local={LOCAL_TARGET_PER_MONTH} | range {ds}~{de}\")\n",
    "\n",
    "    page_num = 1\n",
    "    for start in range(1, MAX_SEARCH_PAGES_PER_MONTH * 10, 10):\n",
    "        if len(nat_rows) >= NATIONAL_TARGET_PER_MONTH and len(loc_rows) >= LOCAL_TARGET_PER_MONTH:\n",
    "            break\n",
    "\n",
    "        search_url = f\"{base_search}&start={start}\"\n",
    "        resp = session.get(search_url, timeout=12)\n",
    "        if resp.status_code != 200:\n",
    "            print(f\"  - search page failed (status={resp.status_code}) start={start}\")\n",
    "            time.sleep(SLEEP_BETWEEN_PAGES)\n",
    "            page_num += 1\n",
    "            continue\n",
    "\n",
    "        soup = BeautifulSoup(resp.text, \"html.parser\")\n",
    "        cards = soup.select(\"div.news_area\")\n",
    "        if not cards:\n",
    "            # no results\n",
    "            break\n",
    "\n",
    "        for card in cards:\n",
    "            if len(nat_rows) >= NATIONAL_TARGET_PER_MONTH and len(loc_rows) >= LOCAL_TARGET_PER_MONTH:\n",
    "                break\n",
    "\n",
    "            press_el = card.select_one(\"a.info.press\")\n",
    "            press = clean_text(press_el.get_text(\" \", strip=True)) if press_el else \"\"\n",
    "\n",
    "            # Filter outlets by your manuscript definition\n",
    "            group = None\n",
    "            if press in NATIONAL_OUTLETS:\n",
    "                group = \"national\"\n",
    "            elif press in LOCAL_OUTLETS:\n",
    "                group = \"local\"\n",
    "            else:\n",
    "                continue\n",
    "\n",
    "            # Skip if group quota met\n",
    "            if group == \"national\" and len(nat_rows) >= NATIONAL_TARGET_PER_MONTH:\n",
    "                continue\n",
    "            if group == \"local\" and len(loc_rows) >= LOCAL_TARGET_PER_MONTH:\n",
    "                continue\n",
    "\n",
    "            # Prefer NAVER-hosted article URL\n",
    "            info_links = card.select(\"a.info\")\n",
    "            url = pick_article_url(info_links)\n",
    "            if not url:\n",
    "                continue\n",
    "\n",
    "            # Dedup\n",
    "            if url in seen_month or url in global_seen:\n",
    "                continue\n",
    "            seen_month.add(url)\n",
    "\n",
    "            title, content, ym_parsed, ymd_full, hm = parse_article(session, url)\n",
    "\n",
    "            # Ensure month match (NAVER sometimes returns neighbor dates)\n",
    "            if ym_parsed and ym_parsed != ym:\n",
    "                continue\n",
    "\n",
    "            # Minimal quality filter\n",
    "            if not title or not content:\n",
    "                continue\n",
    "\n",
    "            row = {\n",
    "                \"month\": ym,\n",
    "                \"group\": group,\n",
    "                \"press\": press,\n",
    "                \"url\": url,\n",
    "                \"title\": title,\n",
    "                \"content\": content,\n",
    "                \"date_ym\": ym_parsed,\n",
    "                \"date_ymd\": ymd_full,\n",
    "                \"time_hm\": hm,\n",
    "            }\n",
    "\n",
    "            if group == \"national\":\n",
    "                nat_rows.append(row)\n",
    "            else:\n",
    "                loc_rows.append(row)\n",
    "\n",
    "            global_seen.add(url)\n",
    "\n",
    "            time.sleep(SLEEP_BETWEEN_REQUESTS)\n",
    "\n",
    "        print(f\"  page {page_num:02d} | nat={len(nat_rows):3d} loc={len(loc_rows):3d} (start={start})\")\n",
    "        time.sleep(SLEEP_BETWEEN_PAGES)\n",
    "        page_num += 1\n",
    "\n",
    "    # If you want strict monthly quotas, you can warn when underfilled\n",
    "    if len(nat_rows) < NATIONAL_TARGET_PER_MONTH or len(loc_rows) < LOCAL_TARGET_PER_MONTH:\n",
    "        print(f\"  [WARN] Underfilled month {ym}: nat={len(nat_rows)}, loc={len(loc_rows)}. Consider increasing MAX_SEARCH_PAGES_PER_MONTH or adding keywords.\")\n",
    "\n",
    "    # Append month results\n",
    "    all_rows.extend(nat_rows)\n",
    "    all_rows.extend(loc_rows)\n",
    "\n",
    "# =========================================================\n",
    "# 4) Save\n",
    "# =========================================================\n",
    "df_out = pd.DataFrame(all_rows)\n",
    "\n",
    "# Optional: enforce column order\n",
    "cols = [\"month\", \"group\", \"press\", \"url\", \"title\", \"content\", \"date_ym\", \"date_ymd\", \"time_hm\"]\n",
    "df_out = df_out[cols]\n",
    "\n",
    "df_out.to_excel(OUT_XLSX, index=False)\n",
    "print(\"\\nSaved:\", OUT_XLSX)\n",
    "print(\"Total rows:\", len(df_out))\n",
    "print(\"Unique URLs:\", df_out[\"url\"].nunique())\n",
    "print(\"Group counts:\\n\", df_out[\"group\"].value_counts(dropna=False))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
